{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 融合各个模型结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T02:17:12.229779Z",
     "start_time": "2020-06-22T02:17:11.154030Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.4.0'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import random\n",
    "import time\n",
    "import copy\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "tqdm.pandas()\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from torch.nn.utils.rnn import pack_padded_sequence, pack_sequence, pad_packed_sequence, pad_sequence\n",
    "from torch.utils.data import DataLoader, Dataset, SequentialSampler\n",
    "\n",
    "DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "torch.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T02:17:14.098058Z",
     "start_time": "2020-06-22T02:17:14.067213Z"
    }
   },
   "outputs": [],
   "source": [
    "# set random seeds to keep the results identical\n",
    "def setup_seed(seed):\n",
    "    torch.manual_seed(seed)\n",
    "    torch.cuda.manual_seed_all(seed)\n",
    "    np.random.seed(seed)\n",
    "    random.seed(seed)\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "    \n",
    "GLOBAL_SEED = 2020\n",
    "setup_seed(GLOBAL_SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T02:17:14.427135Z",
     "start_time": "2020-06-22T02:17:14.394838Z"
    }
   },
   "outputs": [],
   "source": [
    "data_path = './processed_data/'\n",
    "res_path = './result/'\n",
    "save_path = './processed_result'\n",
    "if not os.path.exists(save_path):\n",
    "    os.makedirs(save_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-22T02:17:37.689746Z",
     "start_time": "2020-06-22T02:17:16.243050Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.read_pickle(os.path.join(data_path, 'processed_data_numerical.pkl'))\n",
    "df['age'] = df['age'] - 1\n",
    "df['gender'] = df['gender'] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['lstm_v8_300size_win40_5folds_1.4638.npy',\n",
       " '.ipynb_checkpoints',\n",
       " 'lstm_v10_300size_win30_10folds_1.4648.npy',\n",
       " 'lstm_v1_300size_win10_5folds_1.4634.npy',\n",
       " 'lstm_v11_128_128_10folds_1.4646.npy',\n",
       " 'attention_lstm_v1_128_128_5folds_1.4613.npy',\n",
       " 'lstm_v5_512size_win10_5folds_1.4624.npy',\n",
       " 'lstm_v4_128_128_5folds_1.4629.npy',\n",
       " 'lstm_v6_300size_win20_5folds_1.4642.npy',\n",
       " 'lstm_v2_300size_win10_dropout_5folds_1.4644.npy',\n",
       " 'submission.csv',\n",
       " 'lstm_v3_300size_win100_5folds_1.4624.npy',\n",
       " 'lstm_v9_300size_win50_5folds_1.4642.npy',\n",
       " 'lstm_v7_300size_win30_5folds_1.4642.npy']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.listdir(res_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_res(name):\n",
    "    res = np.load(os.path.join(res_path, name))\n",
    "    X_train = res[:3000000, :12]\n",
    "    y_train = res[:3000000, 12:]\n",
    "    X_test = res[3000000:, :12]\n",
    "    return X_train, y_train, X_test\n",
    "\n",
    "X_train_list = []\n",
    "y_train_list = []\n",
    "X_test_list = []\n",
    "select_res = [\n",
    " 'lstm_v10_300size_win30_10folds_1.4648.npy',\n",
    " 'lstm_v11_128_128_10folds_1.4646.npy',\n",
    " 'lstm_v6_300size_win20_5folds_1.4642.npy',\n",
    " 'lstm_v7_300size_win30_5folds_1.4642.npy',\n",
    " 'lstm_v8_300size_win40_5folds_1.4638.npy',\n",
    " 'lstm_v9_300size_win50_5folds_1.4642.npy',\n",
    " 'lstm_v1_300size_win10_5folds_1.4634.npy',\n",
    " 'lstm_v2_300size_win10_dropout_5folds_1.4644.npy',\n",
    " 'lstm_v3_300size_win100_5folds_1.4624.npy',\n",
    " 'lstm_v4_128_128_5folds_1.4629.npy',\n",
    " 'lstm_v5_512size_win10_5folds_1.4624.npy',\n",
    " 'attention_lstm_v1_128_128_5folds_1.4613.npy']\n",
    "for name in select_res:\n",
    "    X_train, y_train, X_test = load_res(name)\n",
    "    X_train_list.append(X_train)\n",
    "    y_train_list.append(y_train)\n",
    "    X_test_list.append(X_test)\n",
    "    \n",
    "X_train = np.stack(X_train_list)\n",
    "y_train = y_train_list[0]\n",
    "X_test = np.stack(X_test_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_age = X_test.mean(axis=0)[:, :10].argmax(axis=1)\n",
    "y_pred_gender = X_test.mean(axis=0)[:, 10:].argmax(axis=1)\n",
    "\n",
    "df_submit = df.iloc[3000000:, -2:].rename({'age': 'predicted_age', 'gender':'predicted_gender'}, axis=1)\n",
    "df_submit['predicted_age'] = y_pred_age + 1\n",
    "df_submit['predicted_gender'] = y_pred_gender + 1\n",
    "df_submit.to_csv(os.path.join(res_path, \"submission.csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>predicted_age</th>\n",
       "      <th>predicted_gender</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3000001</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3000002</th>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3000003</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3000004</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3000005</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3999996</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3999997</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3999998</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3999999</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4000000</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         predicted_age  predicted_gender\n",
       "user_id                                 \n",
       "3000001              3                 1\n",
       "3000002              7                 2\n",
       "3000003              2                 2\n",
       "3000004              3                 1\n",
       "3000005              4                 1\n",
       "...                ...               ...\n",
       "3999996              2                 1\n",
       "3999997              2                 1\n",
       "3999998              2                 1\n",
       "3999999              3                 1\n",
       "4000000              5                 1\n",
       "\n",
       "[1000000 rows x 2 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_py3",
   "language": "python",
   "name": "conda_pytorch_py3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
